# Figure_A11.R

# Part of the replication archive for 
#
#   Bullock, John G. 2020. "Education and Attitudes toward Redistribution in
#   the United States." British Journal of Political Science 50.


# This file produces Figure A11 in the appendix to the article: "Missing Data 
# in the ANES and GSS." If the EDUC_DIFF argument is set to TRUE, it will 
# instead produce Figure A12: "Differences in Missingness by Education Level."


if (interactive()) {
  EDUC_DIFF <- FALSE   
} else {
  clArgs <- commandArgs(trailingOnly = TRUE)
  if (length(clArgs) <= 0) { 
    writeLines(paste0("clArgs has length ", length(clArgs), ".  This length is less than 1, so the script is stopping."))
    stop()
  }
  EDUC_DIFF <- as.logical(clArgs[1])
}


library(Bullock, lib.loc = c(.libPaths(), 'packageLibrary'))    # for merge_fac()
library(car)        # for Recode()
library(dplyr)      # for %>%
library(forcats)    # for fct_relabel()
library(grid)
library(haven)      # for read_spss()
library(lattice)
library(reshape2)   # for melt()

# Install version 0.9.1 of the "gridextra" package. Future versions dropped 
# functions that are used in this file, e.g., grid.colorstrip().
#   If you prefer, you can download the source file directly from  
# https://cran.r-project.org/src/contrib/Archive/gridExtra/gridExtra_0.9.1.tar.gz.
education_BJPS_lib <- 'packageLibrary' 
if (! 'gridExtra' %in% installed.packages(lib.loc = education_BJPS_lib)[, 'Package']) {

  if (!dir.exists(education_BJPS_lib)) dir.create(education_BJPS_lib)

  if (Sys.info()['sysname'] == 'Windows') {
    install.packages(
      pkgs    = 'gridExtra_0.9.1_source/gridExtra_0.9.1.zip',
      repos  = NULL,
      lib     = normalizePath(education_BJPS_lib),
      verbose = TRUE)
  } else {
    install.packages(
      pkgs    = 'gridExtra_0.9.1_source/gridExtra_0.9.1.tar.gz', 
      repos   = NULL, 
      type    = "source", 
      lib     = normalizePath(education_BJPS_lib),
      verbose = TRUE)
  }
}
library(gridExtra, lib.loc = normalizePath(education_BJPS_lib))


source('functions/tabNA.R')
dirOutput <- 'float_output/'



##############################################################################
# LOAD ANES AND GSS DATA
##############################################################################
ANES <- readRDS('data/ANES_missingness.RDS')    
GSS_cumulative_filename <- 'GSS7218_R1.DTA'
if (! file.exists(paste0("data/", GSS_cumulative_filename))) {
  GSS_cumulative <- tempfile(fileext = '.zip')
  download.file(
    url      = 'http://www.gss.norc.org/documents/stata/GSS_stata.zip', 
    destfile = GSS_cumulative)
  unzip(GSS_cumulative, GSS_cumulative_filename, exdir = 'data')
}
GSS <- read_dta(paste0("data/", GSS_cumulative_filename)) %>%
  select(
    year,
    reg16,
    educ,
    degree,
    paeduc,
    maeduc,
    padeg,
    madeg,
    cohort,
    id,
    age,
    sex,
    race,
    born,
    eqwlth,
    goveqinc,
    eqincome,
    helppoor,
    natfare,
    marital,
    realrinc,
    incdef,
    prestige,
    papres16,
    papres80,
    goodlife,
    starts_with('pay'),
    starts_with('giv'),
    getahead,
    welfare1,
    wordsum,    
    occ,
    occ80) %>%
  as_factor() %>%
  mutate_if(is.factor, fct_relabel, toupper)  # change factor levels to upper case



##############################################################################
# CREATE THE DATA FRAME TO PLOT
##############################################################################
ANES$educ       <- pmin(ANES$educ, 13) 
GSS$educ        <- Recode(GSS$educ, 'c("IAP", "DK", "NA")=NA', as.factor = FALSE) %>%   
  pmin(.,  13)

ANES$educLevel  <- Recode(
  var       = ANES$VCF0140, 
  recodes   = '1="max8"; 2="max12"; 3:4="HSdiploma"; 5="some college"; 6="BA or higher"; else=NA', 
  as.factor = TRUE) %>%
  ordered(., levels = c('max8', 'max12', 'HSdiploma', 'some college', 'BA or higher'))
ANES$HSgrad     <- ANES$educLevel >= 'HSdiploma'
ANES$college    <- ANES$educLevel >= 'BA or higher'

ANES$year.fac   <- factor(ANES$yearInt)
GSS$year.fac    <- factor(GSS$year)
GSS$HSgrad      <- GSS$educ >= 12 
GSS$college     <- ordered(GSS$degree, levels=c('LT HIGH SCHOOL', 'HIGH SCHOOL', 'JUNIOR COLLEGE', 'BACHELOR', 'GRADUATE')) >= 'BACHELOR'
years.combined  <- sort(unique(c(levels(ANES$year.fac), levels(GSS$year.fac))))



# The 1994 and 2002 ANES include a lot of respondents who first appeared in
# the 1992 and 2000 ANES.  The code below revises ANES$educ so that the 
# 1994 and 2002 education values for these variables are coded as NA, not 
# missing.  The point is to prevent the missingness figure from misleadingly 
# suggesting enormous missingness on the 1994 and 2002 education variables.  
ANES$educ[is.na(ANES$educ) & ANES$yearInt == 1992 & ANES$ID.unique %in% 19900000:19909999] <- 99
ANES$educ[is.na(ANES$educ) & ANES$yearInt == 1994 & ANES$ID.unique %in% 19920000:19929999] <- 99
ANES$educ[is.na(ANES$educ) & ANES$yearInt == 2002 & ANES$ID.unique %in% 20000000:20009999] <- 99
ANES$educ[is.na(ANES$educ)] <- 12345

# Calculate non-missingness rate for ANES education data.
sum(ANES$educ[ANES$yearInt >= 1970]<=13) / sum(ANES$educ[ANES$yearInt >= 1970]!=99)

# The "redistribution to the poor (2)" GSS item is built from two variables,  
# so the preparation of this variable is a little more complex.
GSS$goveqinc <- Recode(GSS$goveqinc, '"IAP" = NA')
GSS$eqincome <- Recode(GSS$eqincome, '"IAP" = NA')
GSS$goveqinc <- merge_fac(c('goveqinc', 'eqincome'), envir = as.environment(GSS))
levels(GSS$goveqinc) <- c(levels(GSS$goveqinc), 'IAP')
GSS$goveqinc[is.na(GSS$goveqinc)] <- 'IAP'


# CREATE DATA FRAME FOR MISSINGNESS IN THE ANES
missingness.data.ANES <- data.frame(
  year = as.numeric(levels(ANES$year.fac)),   
  years.of..education..ANES = tabNA(
    ANES$educ,              
    NA.codes      = '99',
    missing.codes = '12345',
    envir         = as.environment(ANES),
    show.educ     = EDUC_DIFF), 
  guarantee..SOL..ANES = tabNA(
    ANES$VCF0809, 
    NA.codes      = '0', 
    missing.codes = '9', 
    envir         = as.environment(ANES), 
    show.educ     = EDUC_DIFF),
  health.care..ANES = tabNA(
    ANES$VCF0806, 
    NA.codes      = '0', 
    missing.codes = '9', 
    envir         = as.environment(ANES), 
    show.educ     = EDUC_DIFF))

if (EDUC_DIFF) {
  # Make the columns in missingness.data.ANES reflect the differences in 
  # missingness between low- and high-education respondents.  
  missingness.data.ANES <- data.frame(
    year                      = missingness.data.ANES$year,
    years.of..education..ANES = missingness.data.ANES[, 2] - missingness.data.ANES[,  4],
    guarantee..SOL..ANES      = missingness.data.ANES[, 5] - missingness.data.ANES[,  7],
    health.care..ANES         = missingness.data.ANES[, 8] - missingness.data.ANES[, 10])
}

# We don't use ANES data after 2008 (because of restrictions on state-when-
# young data that were enacted after the 2008 ANES).
missingness.data.ANES[missingness.data.ANES$year > 2008,] <- NaN



# CREATE DATA FRAME FOR MISSINGNESS IN THE GSS
missingness.data.GSS <- data.frame(
  year                      = sort(unique(GSS$year)),   
  years.of..education..GSS  = tabNA(GSS$educ,     missing.codes=98:99,                                    envir=as.environment(GSS), show.educ = EDUC_DIFF),
  redistrib..to.poor.1..GSS = tabNA(GSS$eqwlth,   missing.codes='c("DK", "NA")',          NA.codes='IAP', envir=as.environment(GSS), show.educ = EDUC_DIFF),
  redistrib..to.poor.2..GSS = tabNA(GSS$goveqinc, missing.codes='c("CANT CHOOSE", "DK")', NA.codes='IAP', envir=as.environment(GSS), show.educ = EDUC_DIFF),
  welfare..GSS              = tabNA(GSS$natfare,  missing.codes='c("DK", "NA")',          NA.codes='IAP', envir=as.environment(GSS), show.educ = EDUC_DIFF),
  help.poor..GSS            = tabNA(GSS$helppoor, missing.codes='c("DK", "NA")',          NA.codes='IAP', envir=as.environment(GSS), show.educ = EDUC_DIFF))

if (EDUC_DIFF) {
  # Make the columns in missingness.data.GSS reflect the differences in 
  # missingness between low- and high-education respondents.  
  missingness.data.GSS <- data.frame(
    year                      = missingness.data.GSS$year,
    years.of..education..GSS  = missingness.data.GSS[,  2] - missingness.data.GSS[,  4],
    redistrib..to.poor.1..GSS = missingness.data.GSS[,  5] - missingness.data.GSS[,  7],
    redistrib..to.poor.2..GSS = missingness.data.GSS[,  8] - missingness.data.GSS[, 10],
    welfare..GSS              = missingness.data.GSS[, 11] - missingness.data.GSS[, 13],
    help.poor..GSS            = missingness.data.GSS[, 14] - missingness.data.GSS[, 16])
}

# The GSS year-at-14 data aren't available before 1978, so I can't use any GSS
# data before 1978.  In this code, I ensure that all of the pre-1978 GSS cells
# in the figure contain diagonal lines, indicating that no data were available
# for those years.
#   We don't use GSS data after 2012 because, as of this writing, the GSS 
# hasn't made the restricted state-of-residence-when-young variable available
# for years after 2012.
missingness.data.GSS[missingness.data.GSS$year < 1978 | missingness.data.GSS$year > 2012,] <- NaN


# Make both year variables into factors that have the same comprehensive set 
# of levels.  I need to do this so that I can merge the data frames well.
missingness.data.ANES$year <- ordered(missingness.data.ANES$year, levels = years.combined)
missingness.data.GSS$year  <- ordered(missingness.data.GSS$year,  levels = years.combined)
missingness.data <- merge(missingness.data.ANES, missingness.data.GSS, by='year', all=TRUE)
missingness.data <- missingness.data[, c(1, 2, 5, 6, 7, 3, 4, 8, 9)]  # reorder columns

# Eliminate years before any outcome question was asked.  
missingness.data <- subset(missingness.data, year>=1970)

# Eliminate education variables if we are plotting differences in nonresponse
# between different education levels. In that case, it doesn't make sense to
# plot nonresponse on the education variables themselves.
if (EDUC_DIFF) {
  missingness.data <- missingness.data[, !grepl('^(HS|years.of..education)', colnames(missingness.data))]
}

# Put data into form for plotting.  
dataToPlot <- melt(
  missingness.data, 
  id.vars    = "year",
  value.name = "percent.missing")
dataToPlot <- dataToPlot[order(dataToPlot$year, dataToPlot$variable),]
dataToPlot$panel.num <- 1:nrow(dataToPlot)  
rownames(dataToPlot) <- NULL
dataToPlot$percent.missing[dataToPlot$percent.missing < 0] <- 0



##############################################################################
# PRELIMINARIES FOR PRINTING THE FIGURE
##############################################################################
filenameStem  <- if (EDUC_DIFF) 'Figure_A12' else 'Figure_A11'  
PDFtitle         <- 'Missingness in the ANES and GSS variables'
PS_width          <- 9.5
PS_height         <- 12.0
postscriptBackground    <- 'transparent'

# DETERMINE PANEL LAYOUT
ncol           <- ncol(missingness.data) - 1 
nrow           <- nrow(missingness.data)  # one row per year
panelLayout    <- c(ncol, nrow)           # columns, then rows

# SET PANEL DIMENSIONS
panelHeight    <- if (EDUC_DIFF) list(x = 6.00/nrow, units = "in") else list(x = 6.25/nrow, units = "in")
granularity    <- unit(2.5, "mm")                    # horizontal distance between diagonal lines.  Smaller numbers make denser panels. 
panelWidth     <- list(x = 5.80/ncol, units = "in")  # 6.5" for the figure, and we leave some space for year labels on the side(s) of the figure 
if (EDUC_DIFF) { panelWidth$x <- panelWidth$x - .05 }  

# OTHER PRELIMINARIES
plotLineColor    <- 'black'
plotLineWidth    <- 1
panelBorderCol   <- 'transparent'
panelNumberCol   <- grey(.2)                     # color of number of each panel; lower values are darker
panelborderwidth <- 0
xBetween         <- 0                            # space between columns
yBetween         <- 0                            # space between rows
panel.xlim       <- NULL                        
panel.ylim       <- NULL  
xaxtextsize      <-  .8                          # cex
yAxisTextSize    <-  .75                         # cex
axisTickSize     <- .4
xaxs             <- list(draw        = FALSE, 
                         labels      = c(1920, 1958, 1995), 
                         at          = c(1920, 1958, 1995), 
                         tck         = c(axisTickSize, 0), 
                         col         = panelBorderCol, 
                         cex         = xaxtextsize,
                         alternating = 1, 
                         relation    = 'free', 
                         axs         = 'i')      # axs='i' means that there is no padding around the xlimits
yaxs             <- list(draw=FALSE,
                         labels      = c(2, 6, 10), 
                         at          = c(2, 6, 10),
                         tck         = c(axisTickSize, 0),
                         col         = panelBorderCol,
                         cex         = yAxisTextSize,
                         alternating = 1,
                         relation    = 'free',   # but the ylim will constrain all panels to same height
                         rot         = 90,
                         axs         = 'i')
                    


                       
##############################################################################
# PANEL FUNCTIONS
##############################################################################
missingness.panel <- function(subscripts, y, ...) { 

  # DRAW PANELS FOR WHICH THE QUESTION WAS NOT ASKED
  if (is.na(y)) {
  }

  # DRAW PANELS FOR WHICH THE QUESTION WAS ASKED
  else {
    grid.polygon(
      x  = c(0,1,1,0,0), 
      y  = c(0,0,1,1,0), 
      gp = gpar(
        col  = 'transparent', 
        fill = if (is.na(y)) { NULL } else { gray(1-y) }, 
        lwd  = 5.5))     
  }

  # CLEAN UP PANELS A BIT
  # In a few cases, the cell for a question-year in which the question was not
  # asked appears immediately to the left of a question-year in which the 
  # question -was- asked.  In these cases, a tiny diagonal line segment 
  # appears in the lower right-hand corner of the "NA" cell.  Its appearance 
  # is disconcerting.  This code places a thin white square over it.  
  if ( is.na(y) & 
      !is.na(dataToPlot$percent.missing[subscripts + 1]) &
      !is.na(dataToPlot$percent.missing[subscripts + 8])) {
    grid.polygon(
      x  = c(0.98, 1, 1.00, 0.98, 0.98), 
      y  = c(0,    0, 0.07, 0.07, 0.00), 
      gp = gpar(
        col  = 'transparent', 
        fill = 'white'))         
  }
  
  
  # ADD TEXT LABELS TO ROWS
  if (panel.number()%%ncol == 1) {         # LHS of figure
    grid.text(
      label = dataToPlot$year[subscripts], 
      x     = -.1, 
      y     =  .45, 
      just  = 'right', 
      gp    = gpar(cex = yAxisTextSize))
  }
  else if (panel.number()%%ncol == 0) {
    grid.text(
      label = dataToPlot$year[subscripts], # RHS of figure
      x     = 1.1, 
      y     =  .45, 
      just  = 'left', 
      gp    = gpar(cex = yAxisTextSize))
  }
  
    
  # ADD TEXT LABELS TO COLUMNS
  if (panel.number()<=ncol) {
    lab <- gsub("\\.\\.", "\n", dataToPlot$variable[panel.number()])
    lab <- gsub("\\.", " ", lab)
    lab <- gsub("(ANES|GSS)", "(\\1)", lab)
    lab <- gsub("(\\d+)", "(\\1)", lab)
    grid.text(
      lab, 
      x    = 0.5, 
      y    = 1.3, 
      just = c('center', 'bottom'),
      gp   = gpar(font=1, lineheight=.95, cex=yAxisTextSize))    
  }

}


greyLegend.panel <- function(...) {   
  maxMissing <- max(dataToPlot$percent.missing, na.rm=TRUE)
  grid.colorstrip(gray(seq(1, 1 - maxMissing, len=256)), direction = 'horizontal')

  if (EDUC_DIFF) {
    panel.axis(
    at      = c(1, c(1.25, 1.5, 1.75)*30/31, 2), 
    labels  = c('0%\ndifference', '7%', '14%', '21%', '28%\ndifference'),
    rot     = 0.0,
    tck     = 0.5,    # tick length multiplier
    outside = TRUE)    
  }
  else {
    panel.axis(
      at      = seq(1, 2, length = 4), 
      labels  = c('0%\nmissing', '7%', '14%', '21%\nmissing'),
      rot     = 0.0,
      tck     = 0.5,  # tick length multiplier
      outside = TRUE)
  }
}

      

##############################################################################
# PRINT THE PDF FILE
##############################################################################
pdf(
  file   = paste0(dirOutput, filenameStem, '.pdf'), 
  width  = PS_width, 
  height = PS_height, 
  paper  = "special", 
  title  = PDFtitle,
  bg     = postscriptBackground)
tmp <- trellis.par.get('axis.components')
  tmp$left$pad1 <- 0.5  # distance between axis ticks and tick labels  
  tmp$top$pad1  <- 0
  tmp$top$pad2  <- 0
  trellis.par.set('axis.components', tmp)
tmp <- trellis.par.get('layout.heights')
  tmp$top.padding <- 0
  tmp$bottom.padding <- 0
  tmp$between <- 0
  trellis.par.set('layout.heights', tmp)
trellis.par.set('axis.line', list(
  alpha = 1, 
  col   = panelBorderCol,
  lty   = 1,
  lwd   = panelborderwidth))
  
  
# CREATE THE DIAGONAL-LINES PLOT
diagLines.plot <- xyplot(
  1 ~ 1, 
  panel    = function() grid.pattern(pattern=1, granularity=granularity, gp=gpar(lwd=0, lty=0)),
  strip    = FALSE,
  scales   = list(x=xaxs, y=yaxs),
  xlab     = '',
  ylab     = '')

# CREATE THE GRAY-COLORSTRIP LEGEND
greyLegend.plot <- xyplot(
  1:2 ~ 1:2, 
  panel    = greyLegend.panel,
  strip    = FALSE,
  scales   = list(x=xaxs, y=yaxs),
  xlab     = '',
  ylab     = '',
  par.settings = list(
    axis.line = list(
      alpha = 1, 
      col   = 'black',
      lty   = 1,
      lwd   = 0.5),
    clip = list(
      panel = "off",
      strip = "off")))
 
# CREATE THE MISSINGNESS MAP  
missingness.plot <- xyplot(
  percent.missing ~ percent.missing | panel.num, 
  data         = dataToPlot, 
  #type         = 'l',
  panel        = missingness.panel,
  as.table     = TRUE,                    # plot panels left to right, top to bottom
  layout       = panelLayout,
  ylim         = panel.ylim,
  between      = list(y=yBetween, x=xBetween),                   
  strip        = FALSE,
  scales       = list(x=xaxs, y=yaxs),
  xlab         = '',
  ylab         = '',
  par.settings = list(
    clip = list(
      panel = "off",
      strip = "off")))

# CREATE THE GRID LAYOUT AND VIEWPORTS
masterLayout <- grid.layout(
  # --panelHeight$x is the height of a "rectangle" for any individual year
  # --In nrow*panelHeight, "nrow" is defined as the number of years for which 
  #   I am plotting data.
  nrow    = 3,
  ncol    = 1,
  heights = unit(
    c(nrow*panelHeight$x, .25, panelHeight$x),   
    c("inches", "inches", "inches")),
  respect = matrix(c(1, 1, 1)))
vp1 <- viewport(layout.pos.row = 1,  name = "top")  
vp2 <- viewport(layout.pos.row = 2,  name = "spacer")    
vp3 <- viewport(layout.pos.row = 3,  name = "bottom")     


# PRINT THE FIGURE
grid.newpage()
pushViewport(
  vpTree(
    viewport(layout = masterLayout, name = "master"),
    vpList(vp1, vp2, vp3)))
seekViewport("master")
print(
  diagLines.plot, 
  panel.width  = list(x=ncol*panelWidth$x,  units=panelWidth$units), 
  panel.height = list(x=nrow*panelHeight$x, units=panelHeight$units),
  draw.in      = "top",
  more         = TRUE)
print(
  missingness.plot, 
  panel.width  = panelWidth, 
  panel.height = panelHeight, 
  draw.in      = "top")
print(
  greyLegend.plot, 
  panel.width  = list(x=ncol*panelWidth$x, units=panelWidth$units), 
  panel.height = list(.67*panelHeight$x,   units=panelHeight$units),
  draw.in      = "bottom")
dev.off()


if (!(Sys.which('pdfcrop')=='' | Sys.which('perl')=='')  |  'pdfcrop' %in% installed.packages()[, 'Package']) {  # if "pdfcrop" is installed 
  system(
    paste(
      if (Sys.info()['sysname']=='Windows') paste(Sys.getenv('Comspec'), '/c ') else "",  
      'pdfcrop', 
      paste0(dirOutput, filenameStem, '.pdf'), 
      paste0(dirOutput, filenameStem, '_crop.pdf')))
  file.remove(paste0(dirOutput, filenameStem, '.pdf'))
  if (interactive() & Sys.info()['sysname']=='Windows') shell.exec(paste0(normalizePath(dirOutput), filenameStem, '_crop.pdf'))
}